Generating matrix of the rsem estimated_counts
# Loading results
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
setwd("C:\\Users\\mapanavici\\RStudio\\R")
Ov4Carbopool3 <- read.delim("WTCHG_626197_201106_.genes.results", header = TRUE)
Ovcar4Parental1 <- read.delim("WTCHG_626197_289105_.genes.results", header = TRUE)
Ovcar4Parental2 <- read.delim("WTCHG_626197_290117_.genes.results", header = TRUE)
Ovcar4Parental3 <- read.delim("WTCHG_626197_291129_.genes.results", header = TRUE)
Ov4Carbopool1 <- read.delim("WTCHG_626197_295177_.genes.results", header = TRUE)
Ov4Carbopool2 <- read.delim("WTCHG_626197_296189_.genes.results", header = TRUE)
# Obtaining "gene_id" and "expected_count" columns, renaming "expected_count" column to individual sample.names
get_counts <- function(sample_df) {
id_and_counts <- select(sample_df, gene_id, expected_count)
names(id_and_counts)[2] <- deparse(substitute(sample_df))
return(id_and_counts)
}
df1 <- get_counts(Ovcar4Parental1)
df2 <- get_counts(Ovcar4Parental2)
df3 <- get_counts(Ovcar4Parental3)
df4 <- get_counts(Ov4Carbopool1)
df5 <- get_counts(Ov4Carbopool2)
df6 <- get_counts(Ov4Carbopool3)
# Merging all data sets by "gene_id" column
rawCounts <- list(df1, df2, df3, df4, df5 ,df6) %>%
reduce(inner_join, by='gene_id')
Differential Expression Analysis
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
## Warning: package 'BiocManager' was built under R version 4.2.2
## Bioconductor version '3.15' is out-of-date; the current release version '3.16'
## is available with R version '4.2'; see https://bioconductor.org/install
BiocManager::install("DESeq2")
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'DESeq2'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
## 'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
## 'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
## 'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.2.2
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
# Tidying up the data for DESeq2
rownames(rawCounts) <- rawCounts$gene_id
rawCounts <- select(rawCounts, -1)
rawCounts <- ceiling(rawCounts) # converting floats to int for DESeq2
head(rawCounts)
# Inputting - Sample Groups
sampleData <- read.delim("sample_IDs.csv", header = T)
sampleData <- sampleData %>%
separate(sample.name.group, c("sample.name", "group"))
head(sampleData)
# Creating DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = rawCounts,
colData = sampleData,
design = ~ group)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds
## class: DESeqDataSet
## dim: 61541 6
## metadata(1): version
## assays(1): counts
## rownames(61541): ENSG00000000003 ENSG00000000005 ... ENSG00000289643
## ENSG00000289644
## rowData names(0):
## colnames(6): Ovcar4Parental1 Ovcar4Parental2 ... Ov4Carbopool2
## Ov4Carbopool3
## colData names(2): sample.name group
# Filtering out low count genes - allowing 10 counts in at least 3 samples
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep,]
# Running DESeq2 algo
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Using unsupervised clustering to see the relationship among two groups
# Data normalization
norm.counts <- counts(dds, normalized=TRUE)
write.csv(norm.counts, file="normalized_counts.txt", row.names=TRUE)
head(norm.counts)
## Ovcar4Parental1 Ovcar4Parental2 Ovcar4Parental3 Ov4Carbopool1
## ENSG00000000003 868.5490 1010.9747 987.0680 908.6342
## ENSG00000000419 1679.9055 1569.9057 1611.2002 1748.8173
## ENSG00000000457 152.1899 139.1760 164.3483 191.0009
## ENSG00000000460 485.6507 602.3539 689.6758 442.7246
## ENSG00000000971 491.4669 950.8507 805.1109 228.5386
## ENSG00000001036 1343.5367 1475.2660 1338.2646 1055.4731
## Ov4Carbopool2 Ov4Carbopool3
## ENSG00000000003 776.0136 881.1733
## ENSG00000000419 2344.5334 2164.6640
## ENSG00000000457 244.7828 188.4023
## ENSG00000000460 471.3371 553.4318
## ENSG00000000971 366.3062 877.2482
## ENSG00000001036 1120.6192 1533.7125
# Data transformation (using variance stabilizing transformations (VST) which produces transformed data on the log2 scale, normalized with respect to library size)
vsd <- vst(dds, blind=FALSE)
head(assay(vsd))
## Ovcar4Parental1 Ovcar4Parental2 Ovcar4Parental3 Ov4Carbopool1
## ENSG00000000003 9.929855 10.126994 10.095738 9.988132
## ENSG00000000419 10.804267 10.712576 10.747685 10.858869
## ENSG00000000457 7.966106 7.884723 8.037823 8.182649
## ENSG00000000460 9.205757 9.467683 9.636318 9.095802
## ENSG00000000971 9.220019 10.047051 9.832493 8.363318
## ENSG00000001036 10.503236 10.628728 10.497979 10.183420
## Ov4Carbopool2 Ov4Carbopool3
## ENSG00000000003 9.785521 9.948465
## ENSG00000000419 11.260481 11.150564
## ENSG00000000457 8.434630 8.169200
## ENSG00000000460 9.170033 9.363690
## ENSG00000000971 8.875983 9.942705
## ENSG00000001036 10.262225 10.681084
# Hierarchical clustering (dendrogram)
sampleDists <- dist(t(assay(vsd)))
plot(hclust(sampleDists, method = "complete"))
# PCA
plotPCA(vsd, intgroup=c("group"))
# Getting the results of differential expression analysis
res <- results(dds, contrast=c("group", "sensitive", "resistant"))
# Viewing summary of results
summary(res)
##
## out of 15495 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2244, 14%
## LFC < 0 (down) : 2024, 13%
## outliers [1] : 7, 0.045%
## low counts [2] : 0, 0%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#head(res)
#mcols(res)$description
# Diagnostics - observing how well the data has been captured by the model (plotting Dispersion Estimates)
plotDispEsts(dds)
# Graphical Visualizations of the DE genes – MA plot (shows the log2 fold changes of the genes (each dot is a gene) over the mean of normalized counts for all samples in our data.)
plotMA(res)
Positive log2 fold changes values represent upregulated genes in
sensitive compared to resistant group Negative log2 fold changes values
represent dowregulated genes in sensitive compared to resistant
group
# Graphical Visualizations of the differentially expressed genes, subseting the DE genes with padj < 0.05 and log2FC > |1| i.e. at least two-fold change in expression
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.2.2
de.sign <- subset(res, padj < 0.05 & abs(log2FoldChange) >1 )
de.sign.genes <- rownames(de.sign)
scale.dat <- t(scale(t(assay(vsd)[de.sign.genes,])))
pheatmap(scale.dat[de.sign.genes,], cluster_rows=TRUE, show_rownames=FALSE, cluster_cols=TRUE)
# Annotation of output tables
BiocManager::install('AnnotationDbi')
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'AnnotationDbi'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
## 'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
## 'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
## 'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library(AnnotationDbi)
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(org.Hs.eg.db)
##
res.df <- as.data.frame(res)
res.annot <- mapIds(org.Hs.eg.db, keys = rownames(res.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
res.df.annot <- cbind(res.df,gene.symbol = res.annot) #binding the annotations to the sign.df
head(res.df.annot)
write.csv(res.df.annot, file = "DE_sensitiveVSresistant_results_wGeneSymbol.csv")
# Using volcano plot to visualize the Log2Fold changes of all genes (again each dot represents a gene) plotted against the -Log10 Adjusted pvalue.
# In red are genes with padj < 0.05 and log2fold change more or less than 1. They are either differently upregulated (right) or downregulated (left) in sensitive samples compare to resistant.
BiocManager::install('EnhancedVolcano')
## Bioconductor version 3.15 (BiocManager 1.30.19), R 4.2.1 (2022-06-23 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'EnhancedVolcano'
## Old packages: 'bookdown', 'broom', 'bslib', 'cli', 'digest', 'doRNG',
## 'evaluate', 'formatR', 'highr', 'htmlwidgets', 'httpuv', 'isoband',
## 'maptools', 'nlme', 'purrr', 'rmarkdown', 'RSQLite', 'shiny', 'tidytree',
## 'tinytex', 'vctrs', 'xfun', 'yulab.utils'
library(EnhancedVolcano)
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.2.2
EnhancedVolcano(res.df.annot,
lab = res.df.annot$gene.symbol,
x = 'log2FoldChange',
y = 'padj')
Gene set enrichment analysis
# Performing Pre-Ranked Gene Set Enrichment Analysis (GSEA Pre-Ranked) to understand better what biological processes and pathways are affected
library(fgsea)
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
##
## Attaching package: 'data.table'
## The following object is masked from 'package:SummarizedExperiment':
##
## shift
## The following object is masked from 'package:GenomicRanges':
##
## shift
## The following object is masked from 'package:IRanges':
##
## shift
## The following objects are masked from 'package:S4Vectors':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2)
library(qusage)
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# Loading DE results
DE.res <- read.csv("DE_sensitiveVSresistant_results_wGeneSymbol.csv", header = TRUE, row.names = 1)
DE.res
# Loading the Gene Ontology Collection
gmt.file <- read.gmt("c5.go.bp.v7.4.symbols.gmt")
# Ranking DE results by decreasing Log2 Fold-change and then creating a named vector of the log2 fold changes with the names being the gene symbols.
DE.res.ranked <- DE.res[order(DE.res$log2FoldChange, decreasing = T), ]
DE.ranks <- setNames(DE.res.ranked$log2FoldChange, DE.res.ranked$gene.symbol)
# Running the fgsea function
fgseaRes <- fgsea(gmt.file, DE.ranks, minSize=15, maxSize=500)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.06% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
head(fgseaRes)
# Enrichment Plot
topPathways <- fgseaRes[order(padj)]
topPathways
res
## log2 fold change (MLE): group sensitive vs resistant
## Wald test p-value: group sensitive vs resistant
## DataFrame with 15495 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 905.402 0.160219 0.163600 0.979334 0.3274152
## ENSG00000000419 1853.171 -0.364759 0.170455 -2.139908 0.0323622
## ENSG00000000457 179.983 -0.454814 0.229966 -1.977749 0.0479570
## ENSG00000000460 540.862 0.276303 0.206161 1.340228 0.1801711
## ENSG00000000971 619.920 0.609931 0.422748 1.442776 0.1490835
## ... ... ... ... ... ...
## ENSG00000289604 335.8182 0.429061 0.269354 1.59292 0.1111771
## ENSG00000289609 189.7031 0.967793 0.498021 1.94328 0.0519826
## ENSG00000289613 24.0473 -0.546796 0.477237 -1.14575 0.2518966
## ENSG00000289614 45.3963 -0.407616 0.394871 -1.03227 0.3019434
## ENSG00000289626 11.8163 -0.933826 0.714042 -1.30780 0.1909404
## padj
## <numeric>
## ENSG00000000003 0.536898
## ENSG00000000419 0.111981
## ENSG00000000457 0.148879
## ENSG00000000460 0.369557
## ENSG00000000971 0.326394
## ... ...
## ENSG00000289604 0.266715
## ENSG00000289609 0.157709
## ENSG00000289613 0.454918
## ENSG00000289614 0.509589
## ENSG00000289626 0.383416
Pathway Enrichment Analysis
# selecting sign. DE genes
library("AnnotationDbi")
library("org.Hs.eg.db")
# adding symbol column for further pathway analysis
res$symbol = mapIds(org.Hs.eg.db,
keys=row.names(res),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
sig_genes <- res[res$padj <= 0.05 & !is.na(res$padj), "symbol"]
print(paste("Total number of significant genes:", length(sig_genes)))
## [1] "Total number of significant genes: 3442"
write.table(sig_genes, file="significant_genes.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)
Performing pathway analysis online where I uploaded “significant_genes.txt” to Reactome website (https://reactome.org/PathwayBrowser/#TOOL=AT). Selectied parameters:“Project to Humans”
reactome_res <- read.csv("reactome_result.csv")
reactome_res
# top dysregulated pathways
reactome_res %>% dplyr::select(2, 6)
Gene Ontology Biological Pathways (GOBP) related to the tumor micro-environment, particularly the extracellular matrix, were enriched in carboplatin-resistant cells